Load packages

# rm(list = ls())
library(tictoc)  # to monitor time

library(raster) # read raster files
library(rgdal) # Use GDAL functions
library(exactextractr) # Zonal statistics from raster
library(sf) # Working with shapefile layers

library(tidyverse) # data wrangling
library(plyr) # data wrangling
library(dplyr) # data wrangling

library(ggpmisc) # To place geom_text within plot
library(plotly) # Interactive visualization
library(patchwork) # This package for placing multiple plats

print("Packages loaded successfully!")
## [1] "Packages loaded successfully!"

Functions specific to this RMD

#########################################
# Creating function for exponential fit
myexpfit <- function(response_var, pred_var, mydata){
  # print(paste0("Model = ", "log(",response_var, ")~", pred_var))
  model.linear <- lm(paste0("log(",response_var, ")~", pred_var), data = mydata)
   a1 <- exp(model.linear$coefficients[1])
   b1 <- (model.linear$coefficients[2])
   R2 <- summary(model.linear)$r.squared

   myresults <- c(a1, b1, R2)
   return(myresults)
}
#########################################

# My GGPLOT for yield map
my_yield_plot <- function(yield_df){
  fieldtitle <- levels(yield_df$Fieldname)
  # Legend labeling - Splitting the yield range into equal classes
  minyld <- round(min(yield_df$Yield_Mg.ha),0) # minimum yield 
  maxyld <- round(max(yield_df$Yield_Mg.ha),0) # maximum yield
  Nclasses <- 4      # Number of yield classes to split from the whole yield range
  
  mybreaks <- round(seq(minyld, maxyld, len = (Nclasses+1)),0)
  YE_colors <-  c("blue","cyan","lightgreen","yellow","orange", "red")

  yld_plot <- ggplot(yield_df, aes(x = Longitude, y = Latitude)) +
  geom_point(aes(color = Yield_Mg.ha), size = 0.7) +    
      theme_bw() +
      # facet_grid(Field~Year, scales = "free") +
      theme(aspect.ratio = 1.0) +
      # ggtitle("Actual yield (Mg/ha)")+
      facet_wrap(~Fieldname, scales = "free") +
      # theme(legend.position = c(0.85, 0.27))+
      scale_colour_gradientn(colours = YE_colors,  
                             name = "Yield (Mg/ha)", 
                             limits=c(minyld,maxyld),
                             breaks = mybreaks,
                             labels = round(mybreaks, 0)) +
    guides(colour = guide_colourbar(barheight = unit(1.8, "in")))+
    theme(panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), 
            axis.title.x = element_blank(),
            axis.title.y = element_blank(),
            axis.text.x = element_blank(),
            axis.text.y = element_blank(),
            axis.ticks = element_blank(), 
            plot.title = element_text(size=14, face = "bold")) +
      theme(legend.title = element_text(face = "bold", size=8, margin = margin(0,0,7,0)),
            legend.text = element_text(vjust = 0.5),
            strip.text = element_text(face = "bold", size = 10))
  yld_plot
  
}

#########################################
my_pred_plot <- function(sub_dat){
  fieldtitle <- levels(sub_dat$Fieldname)
    # Legend labeling - Splitting the yield range into equal classes
  minyld <- round(min(sub_dat$Pred_Yield),0) # minimum yield 
  maxyld <- round(max(sub_dat$Pred_Yield),0) # maximum yield
  Nclasses <- 4      # Number of yield classes to split from the whole yield range
    
  mybreaks <- round(seq(minyld, maxyld, len = (Nclasses+1)),0)
  YE_colors <-  c("blue","cyan","lightgreen","yellow","orange", "red")

  pred_map <- ggplot(sub_dat, aes(x = Longitude, y = Latitude)) +
        geom_point(aes(color = Pred_Yield), size = 0.7) +    
        theme_bw() +
        # facet_grid(Field~Year, scales = "free") +
        theme(aspect.ratio = 1.0) +
        # ggtitle("Predicted yield (Mg/ha)")+
        facet_wrap(~Fieldname, scales = "free") +
        # theme(legend.position = c(0.85, 0.27))+
        scale_colour_gradientn(colours = YE_colors,  
                               name = "Yield (Mg/ha)", 
                               limits=c(minyld,maxyld),
                               breaks = mybreaks,
                               labels = round(mybreaks, 0)) +
        guides(colour = guide_colourbar(barheight = unit(1.8, "in")))+
        theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(), 
              axis.title.x = element_blank(),
              axis.title.y = element_blank(),
              axis.text.x = element_blank(),
              axis.text.y = element_blank(),
              axis.ticks = element_blank(), 
              plot.title = element_text(size=14, face = "bold")) +
        theme(legend.title = element_text(face = "bold", size=8, margin = margin(0,0,7,0)),
              legend.text = element_text(vjust = 0.5),
              strip.text = element_text(face = "bold", size = 10))
  pred_map
}

#########################################
## Actual and predicted yield map
make_yield_maps <- function(sub_datf){

  sel_dat4 <- sub_datf %>% 
    select(Fieldname, Latitude, Longitude, Yield_Mg.ha, Pred_Yield) %>% 
    gather("yield_type", "Value", 4:5) %>% 
    droplevels()
  
  sel_dat4$yield_type <- recode_factor(factor(sel_dat4$yield_type), Pred_Yield = "Predicted yield", Yield_Mg.ha = "Actual yield")
  
  sel_dat4$yield_type <- factor(sel_dat4$yield_type, levels = c("Actual yield", "Predicted yield"))
  
  minyld <- round(min(sel_dat4$Value),0) # minimum yield 
  maxyld <- round(max(sel_dat4$Value),0) # maximum yield
  Nclasses <- 4      # Number of yield classes to split from the whole yield range
    
  mybreaks <- round(seq(minyld, maxyld, len = (Nclasses+1)),0)
  YE_colors <-  c("blue","cyan","lightgreen","yellow","orange", "red")
  
  yld_plot <- ggplot(sel_dat4, aes(x = Longitude, y = Latitude)) +
    geom_point(aes(color = Value), size = 0.7) +    
        theme_bw() +
        # facet_grid(Field~Year, scales = "free") +
        theme(aspect.ratio = 1.0) +
        # ggtitle("Actual yield (Mg/ha)")+
        facet_wrap(~yield_type, scales = "free") +
        # theme(legend.position = c(0.85, 0.27))+
        scale_colour_gradientn(colours = YE_colors,  
                               name = "Yield (Mg/ha)", 
                               limits=c(minyld,maxyld),
                               breaks = mybreaks,
                               labels = round(mybreaks, 0)) +
      guides(colour = guide_colourbar(barheight = unit(1.8, "in")))+
      theme(panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(), 
              axis.title.x = element_blank(),
              axis.title.y = element_blank(),
              axis.text.x = element_blank(),
              axis.text.y = element_blank(),
              axis.ticks = element_blank(), 
              plot.title = element_text(size=14, face = "bold")) +
        theme(legend.title = element_text(face = "bold", size=8, margin = margin(0,0,7,0)),
              legend.text = element_text(vjust = 0.5),
              strip.text = element_text(face = "bold", size = 10))
  
  yld_plot
}

Load dataset

load("Quantix_deep_learning.RData", verbose = TRUE)
## Loading objects:
##   dat
str(dat)
## 'data.frame':    1270534 obs. of  46 variables:
##  $ Fieldname            : Factor w/ 8 levels "DMD_GH1","PSF_111",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ FlightDate           : Factor w/ 22 levels "2019/06/07","2019/06/09",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ DOY                  : int  158 158 158 158 158 158 158 158 158 158 ...
##  $ Week                 : Factor w/ 14 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Latitude             : num  43 43 43 43 43 ...
##  $ Longitude            : num  -76.6 -76.6 -76.6 -76.6 -76.6 ...
##  $ Yield                : num  182 188 191 194 197 ...
##  $ Yield_Mg.ha          : num  11.4 11.8 12 12.2 12.4 ...
##  $ Type                 : Factor w/ 2 levels "grain","silage": 1 1 1 1 1 1 1 1 1 1 ...
##  $ N                    : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Strip                : Factor w/ 6 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ rgn_red              : num  18 18 18 18 18 18 18 18 18 18 ...
##  $ rgn_green            : num  16 15 15 15 15 15 15 15 15 16 ...
##  $ rgn_nir              : num  36 36 35 35 35 35 35 35 35 35 ...
##  $ rgb_red              : num  115 115 116 117 118 118 119 118 118 119 ...
##  $ rgb_green            : num  89 89 90 90 91 91 92 91 91 92 ...
##  $ rgb_blue             : num  67 67 68 68 69 69 70 69 69 70 ...
##  $ NDVI                 : num  0.333 0.333 0.321 0.321 0.321 ...
##  $ GNDVI                : num  0.385 0.412 0.4 0.4 0.4 ...
##  $ EVI2                 : num  0.597 0.616 0.59 0.59 0.59 ...
##  $ SR                   : num  2 2 1.94 1.94 1.94 ...
##  $ EXG                  : num  -4 -4 -4 -5 -5 -5 -5 -5 -5 -5 ...
##  $ TGI                  : num  3.28 3.28 3.28 2.89 2.89 2.89 2.89 2.89 2.89 2.89 ...
##  $ Mgmt_zone            : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 2 2 2 2 ...
##  $ Climod_GDD           : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ Climod_Temp_C        : num  15.6 15.6 15.6 15.6 15.6 ...
##  $ Climod_Precip_mm     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ACIS_GDD             : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ ACIS_Temp_C          : num  15.6 15.6 15.6 15.6 15.6 ...
##  $ ACIS_Precip_mm       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NASA_PAR             : num  159 159 159 159 159 ...
##  $ NASA_Temp_C          : num  17.3 17.3 17.3 17.3 17.3 ...
##  $ NASA_SH              : num  7.2 7.2 7.2 7.2 7.2 7.2 7.2 7.2 7.2 7.2 ...
##  $ NASA_RH              : num  59.7 59.7 59.7 59.7 59.7 ...
##  $ NASA_Precip_mm       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ NASA_WS              : num  0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 0.56 ...
##  $ NASA_SurfWet         : num  0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 ...
##  $ NASA_RootWet         : num  0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 0.7 ...
##  $ NASA_ProfWet         : num  0.66 0.66 0.66 0.66 0.66 0.66 0.66 0.66 0.66 0.66 ...
##  $ DEM                  : int  125 125 125 125 125 125 125 125 125 125 ...
##  $ DEM_type_Flat_Area   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DEM_type_Lower_Slope : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 2 2 ...
##  $ DEM_type_Middle_Slope: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DEM_type_Ridge       : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 1 1 1 ...
##  $ DEM_type_Upper_Slope : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DEM_type_Valley      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
r2df_all <- data.frame()

Data preprocessing

Check for missing values

# Check if any columns have missing values 
sapply(dat, anyNA)
##             Fieldname            FlightDate                   DOY 
##                 FALSE                 FALSE                 FALSE 
##                  Week              Latitude             Longitude 
##                 FALSE                 FALSE                 FALSE 
##                 Yield           Yield_Mg.ha                  Type 
##                 FALSE                 FALSE                 FALSE 
##                     N                 Strip               rgn_red 
##                  TRUE                  TRUE                 FALSE 
##             rgn_green               rgn_nir               rgb_red 
##                 FALSE                 FALSE                 FALSE 
##             rgb_green              rgb_blue                  NDVI 
##                 FALSE                 FALSE                 FALSE 
##                 GNDVI                  EVI2                    SR 
##                 FALSE                 FALSE                 FALSE 
##                   EXG                   TGI             Mgmt_zone 
##                 FALSE                 FALSE                  TRUE 
##            Climod_GDD         Climod_Temp_C      Climod_Precip_mm 
##                 FALSE                 FALSE                 FALSE 
##              ACIS_GDD           ACIS_Temp_C        ACIS_Precip_mm 
##                 FALSE                 FALSE                 FALSE 
##              NASA_PAR           NASA_Temp_C               NASA_SH 
##                 FALSE                 FALSE                 FALSE 
##               NASA_RH        NASA_Precip_mm               NASA_WS 
##                 FALSE                 FALSE                 FALSE 
##          NASA_SurfWet          NASA_RootWet          NASA_ProfWet 
##                 FALSE                 FALSE                 FALSE 
##                   DEM    DEM_type_Flat_Area  DEM_type_Lower_Slope 
##                 FALSE                 FALSE                 FALSE 
## DEM_type_Middle_Slope        DEM_type_Ridge  DEM_type_Upper_Slope 
##                 FALSE                 FALSE                 FALSE 
##       DEM_type_Valley 
##                 FALSE
# Names of columns that has missing values
names(dat)[sapply(dat, anyNA)]
## [1] "N"         "Strip"     "Mgmt_zone"

Plot all yield data

g_dat <- dat %>%
  dplyr::filter(Type == "grain") %>%
  droplevels()

my_yield_plot(g_dat)

s_dat <- dat %>%
  dplyr::filter(Type == "silage") %>%
  droplevels()

my_yield_plot(s_dat)

Exploratory data analysis

Distribution plots

# Violin plot

vplt <- ggplot(dat, aes(x = Fieldname, y = Yield_Mg.ha, fill = Fieldname)) +
  geom_violin(alpha = 0.4) +
  theme_bw() +
  facet_wrap(~Type, nrow = 2, scales = "free_y")

vplt

dplt <- ggplot(dat, aes(x = Yield_Mg.ha, y = ..scaled.., fill = Fieldname)) +
  geom_density(alpha = 0.4, size = 1) + 
  theme_bw() +
  facet_wrap(~Type, nrow = 2, scales = "free")

dplt

GPS points vs yield

# "DMD_GH1" "PSF_111" "PSF_12"  "SLS_ABH" "SLS_NS"  "SSF_121" "SSF_202" "SSF_66"
# selfield <- "DMD_GH1"

sub_datf <- dat %>% 
  filter(Week == 1 & Type == "grain") %>% 
  droplevels()

# Legend labeling - Splitting the yield range into equal classes
minyld <- round(min(sub_datf$Yield_Mg.ha),0) # minimum yield 
maxyld <- round(max(sub_datf$Yield_Mg.ha),0) # maximum yield
Nclasses <- 4      # Number of yield classes to split from the whole yield range

mybreaks <- round(seq(minyld, maxyld, len = (Nclasses+1)),0)
YE_colors <-  c("blue","cyan","lightgreen","yellow","orange", "red")

#=====================
lat_plot <- ggplot(sub_datf, aes(x = Latitude, y = Yield_Mg.ha)) +
  geom_point(aes(color = Yield_Mg.ha), size = 0.7) +    
      theme_bw() +
      # facet_grid(Field~Year, scales = "free") +
      theme(aspect.ratio = 1.0) +
      ggtitle("Latitude vs Yield")+
      facet_wrap(~Fieldname, scales = "free") +
      # theme(legend.position = c(0.85, 0.27))+
      scale_colour_gradientn(colours = YE_colors,  
                             name = "Yield (Mg/ha)", 
                             limits=c(minyld,maxyld),
                             breaks = mybreaks,
                             labels = round(mybreaks, 0)) +
    guides(colour = guide_colourbar(barheight = unit(1.8, "in")))+
    theme(panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), 
            axis.text.x = element_blank(),
            axis.text.y = element_blank(),
            axis.ticks = element_blank(), 
            plot.title = element_text(size=14, face = "bold")) +
      theme(legend.title = element_text(face = "bold", size=8, margin = margin(0,0,7,0)),
            legend.text = element_text(vjust = 0.5),
            strip.text = element_text(face = "bold", size = 10))
lat_plot

#=====================
long_plot <- ggplot(sub_datf, aes(x = Longitude, y = Yield_Mg.ha)) +
  geom_point(aes(color = Yield_Mg.ha), size = 0.7) +    
      theme_bw() +
      # facet_grid(Field~Year, scales = "free") +
      theme(aspect.ratio = 1.0) +
      ggtitle("Longitude vs Yield")+
      facet_wrap(~Fieldname, scales = "free") +
      # theme(legend.position = c(0.85, 0.27))+
      scale_colour_gradientn(colours = YE_colors,  
                             name = "Yield (Mg/ha)", 
                             limits=c(minyld,maxyld),
                             breaks = mybreaks,
                             labels = round(mybreaks, 0)) +
    guides(colour = guide_colourbar(barheight = unit(1.8, "in")))+
    theme(panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), 
            axis.text.x = element_blank(),
            axis.text.y = element_blank(),
            axis.ticks = element_blank(), 
            plot.title = element_text(size=14, face = "bold")) +
      theme(legend.title = element_text(face = "bold", size=8, margin = margin(0,0,7,0)),
            legend.text = element_text(vjust = 0.5),
            strip.text = element_text(face = "bold", size = 10))
long_plot

Approach 1

1.a. Use only strip data for exponential model

# "DMD_GH1" "PSF_111" "PSF_12"  "SLS_ABH" "SLS_NS"  "SSF_121" "SSF_202" "SSF_66"
# "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
# Best grain field - DMD_GH1
# Best silage filed - SSF_121, PSF_12

selfield <- "DMD_GH1"
# seltype <- "silage"

sub_dat <- dat %>% 
  filter(!is.na(N) & Fieldname == selfield) %>% 
  droplevels()

my_yield_plot(sub_dat)

# "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
lvl <- levels(sub_dat$Week)
sub_dat$Strip <- as.numeric(sub_dat$Strip)

#==============
# Scatter plot between NDVI and Yield
sct_plt <- ggplot(sub_dat, aes(x = NDVI, y = Yield_Mg.ha)) + 
  geom_point(aes(color = Week)) +
  theme_bw() + 
  theme(aspect.ratio = 1) +
  ggtitle("NDVI vs Yield") +
  xlab("NDVI") + 
  ylab("Yield (Mg/ha)") + 
  xlim(c(0.0, 1.0))

sct_plt

#==============

r2df <- data.frame()
for (i in 1:length(lvl)){
  # print(i)
  wsub_dat <- sub_dat %>% 
    filter(Week == lvl[i]) %>%
    droplevels()
  
  mod <- myexpfit("Yield_Mg.ha", "NDVI", wsub_dat)
  
  ## RMSE calculation ======
  a_fit <- as.numeric(mod[1])
  b_fit <- as.numeric(mod[2])
  rsq <- as.numeric(mod[3])
  
  ssub_dat <- dat %>% 
    filter(Week == lvl[i] & Fieldname == selfield) %>% 
    droplevels() 
    
  ssub_dat$Pred_Yield <-  a_fit*exp(b_fit*ssub_dat$NDVI)

  ssub_dat$error <- (ssub_dat$Pred_Yield - ssub_dat$Yield_Mg.ha)
  sqe <- (ssub_dat$error^2)
  mse <- mean(sqe)
  rmse_week <- sqrt(mse)
  # cat(lvl[i], ") a_fit = ", a_fit, "; b_fit = ", b_fit, ";\t R2 = ", rsq,";\t RMSE = ", rmse_week, "\n")
  # cat("RMSE for", selfield, "on week", lvl[i], "is", rmse, "Mg/ha")
  ## ========================
  
  r2 <- as.numeric(c(lvl[i], mod[1], mod[2], mod[3], rmse_week))
  r2df <- rbind(r2df, r2)
  # print(paste0(i, " = ", mod[3]))
}

colnames(r2df) <- c("Weeks", "a", "b", "R2", "RMSE")
str(r2df)
## 'data.frame':    13 obs. of  5 variables:
##  $ Weeks: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ a    : num  11.13 11.89 16.65 13.53 9.93 ...
##  $ b    : num  0.88 0.636 -0.359 0.216 0.761 ...
##  $ R2   : num  0.03295 0.01578 0.01089 0.00963 0.41566 ...
##  $ RMSE : num  2.29 2.25 2.14 2.14 1.72 ...
r2df$Fieldname <- rep(selfield, length(lvl))
r2df$Approach <- rep("1a", length(lvl))
r2df_all <- rbind(r2df_all, r2df)
#============================
## Pick the best week
best_week <- which.max(r2df$R2)
# best_week <- which.min(r2df$RMSE)
cat("The best week with max R2 is week ", best_week)
## The best week with max R2 is week  6
sub_dat1 <- dat %>% 
  filter(Week == best_week & Fieldname == selfield) %>% 
  droplevels()

a_coeff <- r2df[best_week, 2]
b_coeff <- r2df[best_week, 3]
sub_dat1$Pred_Yield <-  a_coeff*exp(b_coeff*sub_dat1$NDVI)

make_yield_maps(sub_dat1)

# my_yield_plot(sub_dat1) + my_pred_plot(sub_dat1)

sub_dat1$error <- (sub_dat1$Pred_Yield - sub_dat1$Yield_Mg.ha)
mse <- mean(sub_dat1$error^2)
rmse <- sqrt(mse)
cat("RMSE for", selfield, "on week", best_week, "is", rmse, "Mg/ha")
## RMSE for DMD_GH1 on week 6 is 1.577851 Mg/ha
#======================
# Secondary axis plot
d  <- r2df
x  <- 'Weeks'
y1 <- 'R2'
y2 <- 'RMSE'

a            <- range(d[[y1]])
b            <- range(d[[y2]])
scale_factor <- diff(a)/diff(b)
d[[y2]]      <- ((d[[y2]] - b[1]) * scale_factor) + a[1]

trans <- ~ ((. - a[1]) / scale_factor) + b[1]

r2_rmse_plt <- ggplot(d) +
              geom_point(aes_string(x, y1)) + 
              geom_line(aes_string(x, y1)) + 
              geom_point(aes_string(x, y2), col='red') + 
              geom_line(aes_string(x, y2), col='red') +
              ggtitle(selfield) + 
              theme_bw()+
              theme(legend.position = c(0.8, 0.7)) + 
              scale_x_continuous(breaks = seq(1,15, 1))+
              scale_y_continuous(sec.axis = sec_axis(trans=trans, name=y2))

r2_rmse_plt

# 
# ggplotly(r2_rmse_plt)

1.b. Use lower and upper 10% of strip data

# "DMD_GH1" "PSF_111" "PSF_12"  "SLS_ABH" "SLS_NS"  "SSF_121" "SSF_202" "SSF_66"
selfield <- "DMD_GH1"

sub_dat <- dat %>% 
  filter(!is.na(N) & Fieldname == selfield) %>%
  droplevels()
my_yield_plot(sub_dat)

# Evaluate qunatile cutoffs
pert <- 0.10

cutoffs <- quantile(sub_dat$Yield_Mg.ha, c(pert, (1-pert)))
cat("Cutoff for field", selfield, " =", cutoffs)
## Cutoff for field DMD_GH1  = 13.30046 16.16241
# Subset points within cutoff
qsub_dat <- sub_dat %>% 
  filter(Yield_Mg.ha <= cutoffs[[1]] | Yield_Mg.ha >= cutoffs[[2]]) %>% 
  droplevels()

my_yield_plot(qsub_dat)

#==============
# Scatter plot between NDVI and Yield
sct_plt <- ggplot(qsub_dat, aes(x = NDVI, y = Yield_Mg.ha)) + 
  geom_point(aes(color = Week)) +
  theme_bw() + 
  theme(aspect.ratio = 1) +
  xlab("NDVI") + 
  ylab("Yield (Mg/ha)") + 
  xlim(c(0.0, 1.0))

sct_plt

#==============

lvl <- levels(qsub_dat$Week)

r2df <- data.frame()
for (i in 1:length(lvl)){
  # print(i)
  wsub_dat <- qsub_dat %>% 
    filter(Week == lvl[i]) %>%
    droplevels()
  
  mod <- myexpfit("Yield_Mg.ha", "NDVI", wsub_dat)
  
  ## RMSE calculation ======
  a_fit <- as.numeric(mod[1])
  b_fit <- as.numeric(mod[2])
  rsq <- as.numeric(mod[3])
  
  ssub_dat <- dat %>% 
    filter(Week == lvl[i] & Fieldname == selfield) %>% 
    droplevels() 
    
  ssub_dat$Pred_Yield <-  a_fit*exp(b_fit*ssub_dat$NDVI)

  ssub_dat$error <- (ssub_dat$Pred_Yield - ssub_dat$Yield_Mg.ha)
  sqe <- (ssub_dat$error^2)
  mse <- mean(sqe)
  rmse_week <- sqrt(mse)
  # cat(lvl[i], ") a_fit = ", a_fit, "; b_fit = ", b_fit, ";\t R2 = ", rsq,";\t RMSE = ", rmse_week, "\n")
  # cat("RMSE for", selfield, "on week", lvl[i], "is", rmse, "Mg/ha")
  ## ========================
  
  r2 <- as.numeric(c(lvl[i], mod[1], mod[2], mod[3], rmse_week))
  r2df <- rbind(r2df, r2)
  # print(paste0(i, " = ", mod[3]))
}

colnames(r2df) <- c("Weeks", "a", "b", "R2", "RMSE")
str(r2df)
## 'data.frame':    13 obs. of  5 variables:
##  $ Weeks: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ a    : num  3.39 5.95 15.37 10.33 7.41 ...
##  $ b    : num  4.328 2.472 -0.233 0.77 1.289 ...
##  $ R2   : num  0.19989 0.06955 0.00198 0.06774 0.70619 ...
##  $ RMSE : num  5.43 2.54 2.04 1.97 1.86 ...
r2df$Fieldname <- rep(selfield, length(lvl))
r2df$Approach <- rep("1b", length(lvl))
r2df_all <- rbind(r2df_all, r2df)

#==========================================
## Select best week and evaluate RMSE
best_week <- which.max(r2df$R2)
cat("The best week with max R2 is week ", best_week)
## The best week with max R2 is week  6
sub_dat2 <- dat %>% 
  filter(Week == best_week & Fieldname == selfield)

a_coeff <- r2df[best_week, 2]
b_coeff <- r2df[best_week, 3]
sub_dat2$Pred_Yield <-  a_coeff*exp(b_coeff*sub_dat2$NDVI)

make_yield_maps(sub_dat2)

# my_yield_plot(sub_dat2) + my_pred_plot(sub_dat2)

sub_dat2$error <- (sub_dat2$Pred_Yield - sub_dat2$Yield_Mg.ha)
se <- (sub_dat2$error^2)
mse <- mean(se)
rmse <- sqrt(mse)
cat("RMSE for", selfield, "on week", best_week, "is", rmse, "Mg/ha")
## RMSE for DMD_GH1 on week 6 is 1.824235 Mg/ha
#======================
# Secondary axis plot
d  <- r2df
x  <- 'Weeks'
y1 <- 'R2'
y2 <- 'RMSE'

a            <- range(d[[y1]])
b            <- range(d[[y2]])
scale_factor <- diff(a)/diff(b)
d[[y2]]      <- ((d[[y2]] - b[1]) * scale_factor) + a[1]

trans <- ~ ((. - a[1]) / scale_factor) + b[1]

r2_rmse_plt <- ggplot(d) +
              geom_point(aes_string(x, y1)) + 
              geom_line(aes_string(x, y1)) + 
              geom_point(aes_string(x, y2), col='red') + 
              geom_line(aes_string(x, y2), col='red') +
              ggtitle(selfield) + 
              theme_bw()+
              theme(legend.position = c(0.8, 0.7)) + 
              scale_x_continuous(breaks = seq(1,15, 1))+
              scale_y_continuous(sec.axis = sec_axis(trans=trans, name=y2))

r2_rmse_plt

# ggplotly(r2_rmse_plt)

Approach 2

2.a. Use whole data without strip (select N == NA)

# "DMD_GH1" "PSF_111" "PSF_12"  "SLS_ABH" "SLS_NS"  "SSF_121" "SSF_202" "SSF_66"
selfield <- "DMD_GH1"

sub_dat <- dat %>% 
  filter(is.na(N) & Fieldname == selfield) %>% 
  droplevels()

my_yield_plot(sub_dat)

#==============
# Scatter plot between NDVI and Yield
sct_plt <- ggplot(sub_dat, aes(x = NDVI, y = Yield_Mg.ha)) + 
  geom_point(aes(color = Week)) +
  theme_bw() + 
  theme(aspect.ratio = 1) +
  xlab("NDVI") + 
  ylab("Yield (Mg/ha)") + 
  xlim(c(0.0, 1.0))

sct_plt

#==============

lvl <- levels(sub_dat$Week)

r2df <- data.frame()
for (i in 1:length(lvl)){
  # print(i)
  wsub_dat <- sub_dat %>% 
    filter(Week == lvl[i]) %>%
    droplevels()
  
  mod <- myexpfit("Yield_Mg.ha", "NDVI", wsub_dat)
  
  ## RMSE calculation ======
  a_fit <- as.numeric(mod[1])
  b_fit <- as.numeric(mod[2])
  rsq <- as.numeric(mod[3])
  
  ssub_dat <- dat %>% 
    filter(Week == lvl[i] & Fieldname == selfield) %>% 
    droplevels() 
    
  ssub_dat$Pred_Yield <-  a_fit*exp(b_fit*ssub_dat$NDVI)

  ssub_dat$error <- (ssub_dat$Pred_Yield - ssub_dat$Yield_Mg.ha)
  sqe <- (ssub_dat$error^2)
  mse <- mean(sqe)
  rmse_week <- sqrt(mse)
  # cat(lvl[i], ") a_fit = ", a_fit, "; b_fit = ", b_fit, ";\t R2 = ", rsq,";\t RMSE = ", rmse_week, "\n")
  # cat("RMSE for", selfield, "on week", lvl[i], "is", rmse, "Mg/ha")
  ## ========================
  
  r2 <- as.numeric(c(lvl[i], mod[1], mod[2], mod[3], rmse_week))
  r2df <- rbind(r2df, r2)
  # print(paste0(i, " = ", mod[3]))
}

colnames(r2df) <- c("Weeks", "a", "b", "R2", "RMSE")
str(r2df)
## 'data.frame':    13 obs. of  5 variables:
##  $ Weeks: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ a    : num  15.2 17.5 15.4 9.4 9.2 ...
##  $ b    : num  -0.323 -0.717 -0.343 0.956 0.851 ...
##  $ R2   : num  0.00406 0.01652 0.0069 0.07443 0.27984 ...
##  $ RMSE : num  2.06 2.05 2.05 2 1.71 ...
r2df$Fieldname <- rep(selfield, length(lvl))
r2df$Approach <- rep("2a", length(lvl))
r2df_all <- rbind(r2df_all, r2df)
#==========================================
## Select best week and evaluate RMSE
best_week <- which.max(r2df$R2)
cat("The best week with max R2 is week ", best_week)
## The best week with max R2 is week  7
sub_dat3 <- dat %>% 
  filter(Week == best_week & Fieldname == selfield)

a_coeff <- r2df[best_week, 2]
b_coeff <- r2df[best_week, 3]
sub_dat3$Pred_Yield <-  a_coeff*exp(b_coeff*sub_dat3$NDVI)

make_yield_maps(sub_dat3)

# my_yield_plot(sub_dat3) + my_pred_plot(sub_dat3)

sub_dat3$error <- (sub_dat3$Pred_Yield - sub_dat3$Yield_Mg.ha)
mse <- mean(sub_dat3$error^2)
rmse <- sqrt(mse)
cat("RMSE for", selfield, "on week", best_week, "is", rmse, "Mg/ha")
## RMSE for DMD_GH1 on week 7 is 1.568345 Mg/ha
#======================
# Secondary axis plot
d  <- r2df
x  <- 'Weeks'
y1 <- 'R2'
y2 <- 'RMSE'

a            <- range(d[[y1]])
b            <- range(d[[y2]])
scale_factor <- diff(a)/diff(b)
d[[y2]]      <- ((d[[y2]] - b[1]) * scale_factor) + a[1]

trans <- ~ ((. - a[1]) / scale_factor) + b[1]

r2_rmse_plt <- ggplot(d) +
              geom_point(aes_string(x, y1)) + 
              geom_line(aes_string(x, y1)) + 
              geom_point(aes_string(x, y2), col='red') + 
              geom_line(aes_string(x, y2), col='red') +
              ggtitle(selfield) + 
              theme_bw()+
              theme(legend.position = c(0.8, 0.7)) + 
              scale_x_continuous(breaks = seq(1,15, 1))+
              scale_y_continuous(sec.axis = sec_axis(trans=trans, name=y2))

r2_rmse_plt

# ggplotly(r2_rmse_plt)

2.b. Use 10 and 90 percentile of strip data

# "DMD_GH1" "PSF_111" "PSF_12"  "SLS_ABH" "SLS_NS"  "SSF_121" "SSF_202" "SSF_66"
selfield <- "DMD_GH1"

sub_dat <- dat %>% 
  filter(is.na(N) & Fieldname == selfield) %>% 
  droplevels()

my_yield_plot(sub_dat)

# Evaluate quantile cutoffs
pert <- 0.10
cutoffs <- quantile(sub_dat$Yield_Mg.ha, c(pert, (1-pert)))
cat("Cutoff for field", selfield, " =", cutoffs)
## Cutoff for field DMD_GH1  = 10.73753 16.14555
# Subset points within cutoff
qsub_dat <- sub_dat %>% 
  filter(Yield_Mg.ha <= cutoffs[[1]] | Yield_Mg.ha >= cutoffs[[2]]) %>% 
  droplevels()

my_yield_plot(qsub_dat)

#==============
# Scatter plot between NDVI and Yield
sct_plt <- ggplot(qsub_dat, aes(x = NDVI, y = Yield_Mg.ha)) + 
  geom_point(aes(color = Week)) +
  theme_bw() + 
  theme(aspect.ratio = 1) +
  xlab("NDVI") + 
  ylab("Yield (Mg/ha)") + 
  xlim(c(0.0, 1.0))

sct_plt

#==============

lvl <- levels(qsub_dat$Week)

r2df <- data.frame()
for (i in 1:length(lvl)){
  # print(i)
  wsub_dat <- qsub_dat %>% 
    filter(Week == lvl[i]) %>%
    droplevels()
  
  mod <- myexpfit("Yield_Mg.ha", "NDVI", wsub_dat)
  
  ## RMSE calculation ======
  a_fit <- as.numeric(mod[1])
  b_fit <- as.numeric(mod[2])
  rsq <- as.numeric(mod[3])
  
  ssub_dat <- dat %>% 
    filter(Week == lvl[i] & Fieldname == selfield) %>% 
    droplevels() 
    
  ssub_dat$Pred_Yield <-  a_fit*exp(b_fit*ssub_dat$NDVI)

  ssub_dat$error <- (ssub_dat$Pred_Yield - ssub_dat$Yield_Mg.ha)
  sqe <- (ssub_dat$error^2)
  mse <- mean(sqe)
  rmse_week <- sqrt(mse)
  cat(lvl[i], ") a_fit = ", a_fit, "; b_fit = ", b_fit, ";\t R2 = ", rsq,";\t RMSE = ", rmse_week, "\n")
  # cat("RMSE for", selfield, "on week", lvl[i], "is", rmse, "Mg/ha")
  ## ========================
  
  r2 <- as.numeric(c(lvl[i], mod[1], mod[2], mod[3], rmse_week))
  r2df <- rbind(r2df, r2)
  # print(paste0(i, " = ", mod[3]))
}
## 1 ) a_fit =  15.22562 ; b_fit =  -0.6268641 ;     R2 =  0.004506561 ;     RMSE =  2.572662 
## 2 ) a_fit =  25.20251 ; b_fit =  -2.013282 ;  R2 =  0.03934784 ;  RMSE =  2.501949 
## 3 ) a_fit =  17.97113 ; b_fit =  -1.004249 ;  R2 =  0.01877287 ;  RMSE =  2.493913 
## 4 ) a_fit =  3.390362 ; b_fit =  3.314632 ;   R2 =  0.2540942 ;   RMSE =  3.395473 
## 5 ) a_fit =  5.633953 ; b_fit =  1.754935 ;   R2 =  0.548074 ;    RMSE =  2.311081 
## 6 ) a_fit =  5.398522 ; b_fit =  1.405825 ;   R2 =  0.6169247 ;   RMSE =  1.869549 
## 7 ) a_fit =  4.295758 ; b_fit =  1.487639 ;   R2 =  0.5689611 ;   RMSE =  1.694422 
## 8 ) a_fit =  2.240503 ; b_fit =  2.073151 ;   R2 =  0.395068 ;    RMSE =  1.761064 
## 9 ) a_fit =  1.322938 ; b_fit =  2.585552 ;   R2 =  0.2829638 ;   RMSE =  1.958049 
## 10 ) a_fit =  0.2277545 ; b_fit =  4.616331 ;     R2 =  0.2371498 ;   RMSE =  2.045412 
## 12 ) a_fit =  2.476396 ; b_fit =  1.840895 ;  R2 =  0.03169973 ;  RMSE =  2.481548 
## 13 ) a_fit =  0.005646806 ; b_fit =  8.835749 ;   R2 =  0.4714134 ;   RMSE =  1.758735 
## 14 ) a_fit =  0.02793306 ; b_fit =  7.227301 ;    R2 =  0.3882082 ;   RMSE =  1.901906
colnames(r2df) <- c("Weeks", "a", "b", "R2", "RMSE")
str(r2df)
## 'data.frame':    13 obs. of  5 variables:
##  $ Weeks: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ a    : num  15.23 25.2 17.97 3.39 5.63 ...
##  $ b    : num  -0.627 -2.013 -1.004 3.315 1.755 ...
##  $ R2   : num  0.00451 0.03935 0.01877 0.25409 0.54807 ...
##  $ RMSE : num  2.57 2.5 2.49 3.4 2.31 ...
r2df$Fieldname <- rep(selfield, length(lvl))
r2df$Approach <- rep("2b", length(lvl))
r2df_all <- rbind(r2df_all, r2df)
#==========================================
## Select best week and evaluate RMSE
best_week <- which.max(r2df$R2)
cat("The best week with max R2 is week ", best_week)
## The best week with max R2 is week  6
sub_dat4 <- dat %>% 
  filter(Week == best_week & Fieldname == selfield) %>% 
  droplevels()

a_coeff <- r2df[best_week, 2]
b_coeff <- r2df[best_week, 3]

sub_dat4$Pred_Yield <-  a_coeff*exp(b_coeff*sub_dat4$NDVI)

make_yield_maps(sub_dat4)

# my_yield_plot(sub_dat4) + my_pred_plot(sub_dat4)

sub_dat4$error <- (sub_dat4$Pred_Yield - sub_dat4$Yield_Mg.ha)
mse <- mean(sub_dat4$error^2)
rmse <- sqrt(mse)
cat("RMSE for", selfield, "on week", best_week, "is", rmse, "Mg/ha")
## RMSE for DMD_GH1 on week 6 is 1.869549 Mg/ha
#======================
# Secondary axis plot
d  <- r2df
x  <- 'Weeks'
y1 <- 'R2'
y2 <- 'RMSE'

#-----------------------------------------------------------------------------
# Rescale the second y axis by 
#   - subtracting its minimum value (to set it to start at 0)
#   - scaling so that it has the same range as the 'y1' variable
#   - offsettting it by the minimum value of y1
#-----------------------------------------------------------------------------
a            <- range(d[[y1]])
b            <- range(d[[y2]])
scale_factor <- diff(a)/diff(b)
d[[y2]]      <- ((d[[y2]] - b[1]) * scale_factor) + a[1]

#-----------------------------------------------------------------------------
# Need to define the second axis transformation to be the inverse of the data
# transformation to everything cancels out appropriately
#-----------------------------------------------------------------------------
trans <- ~ ((. - a[1]) / scale_factor) + b[1]

#-----------------------------------------------------------------------------
# tell the y axis to set up a scaled secondary axis with the given transform
#-----------------------------------------------------------------------------
r2_rmse_plt <- ggplot(d) +
              geom_point(aes_string(x, y1)) + 
              geom_line(aes_string(x, y1)) + 
              geom_point(aes_string(x, y2), col='red') + 
              geom_line(aes_string(x, y2), col='red') +
              ggtitle(selfield) + 
              theme_bw()+
              theme(legend.position = c(0.8, 0.7)) + 
              scale_x_continuous(breaks = seq(1,15, 1))+
              scale_y_continuous(sec.axis = sec_axis(trans=trans, name=y2))

r2_rmse_plt

# ggplotly(r2_rmse_plt)

Saving the combined metrics data into CSV

# write.csv(r2df_all, "Exponential_models_March29.csv", row.names = FALSE)

Load the saved CSV

r2_df <- read.csv("Exponential_models_March29.csv")
str(r2_df)
## 'data.frame':    436 obs. of  7 variables:
##  $ Weeks    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ a        : num  11.13 11.89 16.65 13.53 9.93 ...
##  $ b        : num  0.88 0.636 -0.359 0.216 0.761 ...
##  $ R2       : num  0.03295 0.01578 0.01089 0.00963 0.41566 ...
##  $ RMSE     : num  2.29 2.25 2.14 2.14 1.72 ...
##  $ Fieldname: chr  "DMD_GH1" "DMD_GH1" "DMD_GH1" "DMD_GH1" ...
##  $ Approach : chr  "1a" "1a" "1a" "1a" ...
fcols <- c("Fieldname", "Approach")
r2_df[fcols] <- lapply(r2_df[fcols], factor)

R2 performance comparison plot

# "DMD_GH1" "PSF_111" "PSF_12"  "SLS_ABH" "SLS_NS"  "SSF_121" "SSF_202" "SSF_66" 
r2_df$Fieldname <- factor(r2_df$Fieldname, levels = c("DMD_GH1", "SLS_ABH", "SLS_NS", "SSF_121", "SSF_66", "SSF_202", "PSF_111", "PSF_12"))

r2plt <- ggplot(r2_df, aes(x = Weeks, y = R2, color = Approach)) + 
  geom_point(size = 1) +
  geom_line(size = 0.7) + 
  theme_bw() + 
  facet_wrap(~Fieldname, scales = "free_y") + 
  scale_x_continuous(limits = c(1, 15), breaks = seq(1, 15, 2))
  # scale_y_continuous(limits = c(0.0, 1.0), breaks = seq(0.0, 1.0, 0.2))

r2plt

RMSE performance comparison plot

rmseplt <- ggplot(r2_df, aes(x = Weeks, y = RMSE, color = Approach)) + 
  geom_point(size = 1) +
  geom_line(size = 0.7) + 
  theme_bw() + 
  facet_wrap(~Fieldname, scales = "free_y") + 
  scale_x_continuous(limits = c(1, 15), breaks = seq(1, 15, 2))
  # scale_y_continuous(limits = c(0.0, 1.0), breaks = seq(0.0, 1.0, 0.2))

rmseplt

Cumulative distribution plot per CropType

# Subset grain or silage
crop_dat <- dat %>% 
  filter(Type == "grain") %>% 
  droplevels()

g_cum_dist_plot <- ggplot(crop_dat, aes(Yield_Mg.ha)) +
  stat_ecdf(aes(color = Fieldname), geom = "step") +
  theme_bw()

g_cum_dist_plot

# Silage yield cumulative distribution
crop_dat <- dat %>% 
  filter(Type == "silage") %>% 
  droplevels()

s_cum_dist_plot <- ggplot(crop_dat, aes(Yield_Mg.ha)) +
  stat_ecdf(aes(color = Fieldname), geom = "step") +
  theme_bw()

s_cum_dist_plot

# #===========================
# hist_plot <- ggplot(crop_dat, aes(x = Yield_Mg.ha, y = ..scaled.., fill = Fieldname)) +
#   # stat_ecdf(aes(color = Fieldname), geom = "step") +
#   geom_density(aes(color = Fieldname), alpha = 0.2) + 
#   theme_bw()
# 
# hist_plot